1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

About Dataset This dataset is a copy of another Kaggle dataset which can be accessed here: https://www.kaggle.com/c/glioma-radiomics The difference is that I have provided ground truth for the test set (test_GT.csv).

The suffix “omics” in Medical Science is associated with analysis of big sets of features (e.g. Genomics, Proteomics). Radiomics are imaging features (e.g., first order and second order) extracted from Regions of Interest (ROI) in radiology images based on predefined functions and filters. Low grade gliomas (LGG) are a type of brain tumors. Astrocytes and Oligodendrocytes which are two types of brain cells, are considered as origins of LGG. Adult LGG are characterized by different mutations which is important to be correctly identified. With this dataset the goal is to determine if an ROI has 1p19q codeletion (Mutacion=1) or not(Mutacion=0). This plays a key role in predicting patient’s response to chemotherapy and their survival. The dataset provides 640 different radiomics features for each ROI. There are 105 ROIs in the training set and 45 ROIs in the test cohort.

1.2 The Data

https://www.kaggle.com/datasets/knamdar/radiomics-for-lgg-dataset?select=test_GT.csv


LGG_Data <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/train.csv")
LGG_DataTest <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test.csv")
LGG_TestGT <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test_GT.csv")
LGG_DataTest$Mutacion <- LGG_TestGT$Mutacion

LGG_Data <- rbind(LGG_Data,LGG_DataTest)

rownames(LGG_Data) <- LGG_Data$patientID
LGG_Data$patientID <- NULL

pander::pander(table(LGG_Data$Mutacion))
0 1
54 96

1.2.0.1 Standarize the names for the reporting

studyName <- "LGG"
dataframe <- LGG_Data
outcome <- "Mutacion"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
150 640
pander::pander(table(dataframe[,outcome]))
0 1
54 96

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9999991

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 633 , Uni p: 0.02486647 , Uncorrelated Base: 57 , Outcome-Driven Size: 0 , Base Size: 57 
#> 
#> 
 1 <R=1.000,r=0.975,N=  394>, Top: 93( 3 )[ 1 : 93 Fa= 91 : 0.975 ]( 91 , 235 , 0 ),<|>Tot Used: 326 , Added: 235 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,r=0.975,N=  394>, Top: 40( 1 )[ 1 : 40 Fa= 131 : 0.975 ]( 40 , 76 , 91 ),<|>Tot Used: 366 , Added: 76 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,r=0.975,N=  394>, Top: 4( 3 )[ 1 : 4 Fa= 135 : 0.975 ]( 4 , 13 , 131 ),<|>Tot Used: 377 , Added: 13 , Zero Std: 0 , Max Cor: 0.983
#> 
 4 <R=0.983,r=0.941,N=  202>, Top: 71( 1 )[ 1 : 71 Fa= 165 : 0.941 ]( 68 , 104 , 135 ),<|>Tot Used: 453 , Added: 104 , Zero Std: 0 , Max Cor: 0.998
#> 
 5 <R=0.998,r=0.949,N=  202>, Top: 7( 1 )[ 1 : 7 Fa= 168 : 0.949 ]( 7 , 10 , 165 ),<|>Tot Used: 456 , Added: 10 , Zero Std: 0 , Max Cor: 0.954
#> 
 6 <R=0.954,r=0.877,N=  188>, Top: 69( 1 )=[ 2 : 69 Fa= 189 : 0.930 ]( 66 , 90 , 168 ),<|>Tot Used: 500 , Added: 90 , Zero Std: 0 , Max Cor: 0.977
#> 
 7 <R=0.977,r=0.888,N=  188>, Top: 20( 1 )[ 1 : 20 Fa= 196 : 0.888 ]( 20 , 24 , 189 ),<|>Tot Used: 507 , Added: 24 , Zero Std: 0 , Max Cor: 0.899
#> 
 8 <R=0.899,r=0.849,N=  188>, Top: 49( 1 )[ 1 : 49 Fa= 213 : 0.849 ]( 49 , 57 , 196 ),<|>Tot Used: 533 , Added: 57 , Zero Std: 0 , Max Cor: 1.000
#> 
 9 <R=1.000,r=0.900,N=  188>, Top: 8( 1 )[ 1 : 8 Fa= 217 : 0.900 ]( 8 , 8 , 213 ),<|>Tot Used: 535 , Added: 8 , Zero Std: 0 , Max Cor: 0.992
#> 
 10 <R=0.992,r=0.846,N=   28>, Top: 14( 1 )[ 1 : 14 Fa= 221 : 0.846 ]( 14 , 14 , 217 ),<|>Tot Used: 539 , Added: 14 , Zero Std: 0 , Max Cor: 0.992
#> 
 11 <R=0.992,r=0.846,N=   28>, Top: 3( 1 )[ 1 : 3 Fa= 222 : 0.846 ]( 3 , 3 , 221 ),<|>Tot Used: 539 , Added: 3 , Zero Std: 0 , Max Cor: 0.992
#> 
 12 <R=0.992,r=0.846,N=   28>, Top: 1( 1 )[ 1 : 1 Fa= 222 : 0.846 ]( 1 , 1 , 222 ),<|>Tot Used: 539 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 13 <R=0.992,r=0.800,N=  146>, Top: 55( 1 )[ 1 : 55 Fa= 237 : 0.800 ]( 54 , 77 , 222 ),<|>Tot Used: 563 , Added: 77 , Zero Std: 0 , Max Cor: 0.992
#> 
 14 <R=0.992,r=0.800,N=  146>, Top: 15( 1 )[ 1 : 15 Fa= 238 : 0.800 ]( 15 , 16 , 237 ),<|>Tot Used: 564 , Added: 16 , Zero Std: 0 , Max Cor: 0.992
#> 
 15 <R=0.992,r=0.800,N=  146>, Top: 6( 1 )[ 1 : 6 Fa= 239 : 0.800 ]( 6 , 7 , 238 ),<|>Tot Used: 564 , Added: 7 , Zero Std: 0 , Max Cor: 0.992
#> 
 16 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 17 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 18 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 19 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 20 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 21 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 22 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.992
#> 
 23 <R=0.992,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.989
#> 
 24 <R=0.989,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.833
#> 
 25 <R=0.833,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 239 : 0.800 ]( 1 , 1 , 239 ),<|>Tot Used: 564 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 26 <R=0.800,r=0.800,N=    2>
#> 
 [ 26 ], 0.7999294 Decor Dimension: 564 Nused: 564 . Cor to Base: 220 , ABase: 19 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

6.77e+22

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

6.77e+22

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

0.000131

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.000131

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7999294

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : wavelet.HHH_firstorder_10Percentile 200 : wavelet.HHL_firstorder_Uniformity 300 : wavelet.HLH_glcm_Imc1 400 : wavelet.HLL_gldm_GrayLevelVariance 500 : wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_wavelet.HHH_firstorder_10Percentile 200 : La_wavelet.HHL_firstorder_Uniformity 300 : La_wavelet.HLH_glcm_Imc1 400 : La_wavelet.HLL_gldm_GrayLevelVariance 500 : La_wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : La_wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
original_firstorder_Skewness -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 9.78e-01 0.780
original_glcm_ClusterShade 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 4.41e-02 0.755
wavelet.HLL_glcm_ClusterShade -7.99e+02 3.12e+03 2.32e+03 4.29e+03 4.72e-02 0.737
wavelet.HLL_firstorder_Skewness -1.05e-02 5.50e-01 5.21e-01 8.55e-01 5.54e-02 0.736
original_firstorder_Median 2.77e+02 5.52e+01 3.33e+02 8.63e+01 6.29e-01 0.732
original_glcm_MaximumProbability 6.19e-03 2.32e-03 1.02e-02 6.26e-03 7.10e-02 0.722
original_gldm_LargeDependenceHighGrayLevelEmphasis 1.36e+04 9.87e+03 3.19e+04 3.44e+04 2.54e-03 0.711
original_glcm_JointEnergy 1.72e-03 7.60e-04 2.89e-03 2.23e-03 1.21e-02 0.710
original_firstorder_Mean 2.75e+02 5.33e+01 3.20e+02 7.86e+01 3.40e-01 0.702
original_glszm_LargeAreaHighGrayLevelEmphasis 2.34e+04 4.63e+04 1.77e+05 7.31e+05 1.50e-08 0.700


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
original_firstorder_Skewness -0.0393 5.28e-01 -6.84e-01 6.34e-01 0.9782 0.780
La_wavelet.HLL_glszm_SmallAreaHighGrayLevelEmphasis -55.1442 5.85e+01 2.27e+01 1.23e+02 0.0420 0.762
La_wavelet.LHL_glszm_GrayLevelNonUniformity 9.5446 7.66e+00 2.05e+00 6.86e+00 0.5629 0.761
original_glcm_ClusterShade 741.5044 1.02e+04 -1.06e+04 1.65e+04 0.0441 0.755
La_wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation -0.0545 7.68e-01 6.93e-01 1.20e+00 0.0869 0.746
La_wavelet.LHL_firstorder_10Percentile 89.3291 5.91e+00 8.29e+01 8.62e+00 0.1136 0.744
wavelet.HLL_glcm_ClusterShade -799.0582 3.12e+03 2.32e+03 4.29e+03 0.0472 0.737
wavelet.HLL_firstorder_Skewness -0.0105 5.50e-01 5.21e-01 8.55e-01 0.0554 0.736
La_wavelet.HLL_gldm_DependenceNonUniformity 187.7064 1.34e+02 6.86e+01 1.35e+02 0.7199 0.734
La_wavelet.HLL_gldm_HighGrayLevelEmphasis -23.1086 3.90e+01 -6.66e+01 6.62e+01 0.3168 0.731

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.53 504 0.796


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
original_firstorder_Skewness -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 0.97819 0.780 0.780 NA
La_wavelet.HLL_glszm_SmallAreaHighGrayLevelEmphasis -0.428wavelet.HLL_gldm_SmallDependenceHighGrayLevelEmphasis -0.559wavelet.HLL_glszm_HighGrayLevelZoneEmphasis + 1.000*wavelet.HLL_glszm_SmallAreaHighGrayLevelEmphasis -5.51e+01 5.85e+01 2.27e+01 1.23e+02 0.04203 0.762 0.517 -2
La_wavelet.LHL_glszm_GrayLevelNonUniformity -0.000original_firstorder_Energy + 1.000wavelet.LHL_glszm_GrayLevelNonUniformity 9.54e+00 7.66e+00 2.05e+00 6.86e+00 0.56290 0.761 0.607 0
original_glcm_ClusterShade 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 0.04409 0.755 0.755 NA
La_wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation -0.429wavelet.HLL_firstorder_InterquartileRange + 1.000wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation -5.45e-02 7.68e-01 6.93e-01 1.20e+00 0.08690 0.746 0.511 -1
La_wavelet.LHL_firstorder_10Percentile + 1.000wavelet.LHL_firstorder_10Percentile + 33.267wavelet.LHL_glcm_DifferenceEntropy 8.93e+01 5.91e+00 8.29e+01 8.62e+00 0.11364 0.744 0.539 -1
wavelet.HLL_glcm_ClusterShade -7.99e+02 3.12e+03 2.32e+03 4.29e+03 0.04725 0.737 0.737 NA
wavelet.HLL_firstorder_Skewness -1.05e-02 5.50e-01 5.21e-01 8.55e-01 0.05544 0.736 0.736 NA
La_wavelet.HLL_gldm_DependenceNonUniformity -0.000original_firstorder_Energy + 1.000wavelet.HLL_gldm_DependenceNonUniformity 1.88e+02 1.34e+02 6.86e+01 1.35e+02 0.71994 0.734 0.591 5
La_wavelet.HLL_gldm_HighGrayLevelEmphasis + 1.000wavelet.HLL_gldm_HighGrayLevelEmphasis -0.996wavelet.HLL_glszm_HighGrayLevelZoneEmphasis -2.31e+01 3.90e+01 -6.66e+01 6.62e+01 0.31681 0.731 0.503 0
wavelet.LHL_glszm_GrayLevelNonUniformity NA 3.16e+01 2.24e+01 2.66e+01 2.39e+01 0.00886 0.607 0.607 NA
wavelet.HLL_gldm_DependenceNonUniformity NA 5.25e+02 3.58e+02 4.44e+02 3.58e+02 0.01973 0.591 0.591 NA
wavelet.LHL_glcm_DifferenceEntropy NA 4.24e+00 3.76e-01 4.13e+00 4.87e-01 0.74144 0.571 0.571 NA
wavelet.HLL_firstorder_InterquartileRange NA 4.91e+01 1.34e+01 4.70e+01 1.73e+01 0.80389 0.543 0.543 NA
wavelet.LHL_firstorder_10Percentile NA -5.16e+01 1.40e+01 -5.44e+01 1.89e+01 0.53482 0.539 0.539 NA
wavelet.HLL_glszm_SmallAreaHighGrayLevelEmphasis NA 2.47e+03 2.03e+03 2.75e+03 2.46e+03 0.10030 0.517 0.517 NA
wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation NA 2.10e+01 5.75e+00 2.09e+01 7.69e+00 0.77709 0.511 0.511 NA
wavelet.HLL_glszm_HighGrayLevelZoneEmphasis NA 3.22e+03 2.55e+03 3.48e+03 2.98e+03 0.15442 0.511 0.511 NA
wavelet.HLL_gldm_HighGrayLevelEmphasis NA 3.18e+03 2.55e+03 3.40e+03 2.96e+03 0.13464 0.503 0.503 NA
original_firstorder_Energy NA 6.49e+08 5.58e+08 7.22e+08 7.48e+08 0.02189 0.499 0.499 5
wavelet.HLL_gldm_SmallDependenceHighGrayLevelEmphasis NA 1.69e+03 1.50e+03 1.83e+03 1.79e+03 0.02014 0.496 0.496 NA

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 46 8
1 9 87
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.6333 0.5508 0.710
    tp 0.6400 0.5577 0.717
    se 0.9062 0.8295 0.956
    sp 0.8519 0.7288 0.934
    diag.ac 0.8867 0.8248 0.933
    diag.or 55.5833 20.0995 153.711
    nndx 1.3191 1.1236 1.791
    youden 0.7581 0.5583 0.890
    pv.pos 0.9158 0.8408 0.963
    pv.neg 0.8364 0.7120 0.922
    lr.pos 6.1172 3.2165 11.634
    lr.neg 0.1101 0.0585 0.207
    p.rout 0.3667 0.2896 0.449
    p.rin 0.6333 0.5508 0.710
    p.tpdn 0.1481 0.0662 0.271
    p.tndp 0.0938 0.0438 0.171
    p.dntp 0.0842 0.0371 0.159
    p.dptn 0.1636 0.0777 0.288
  • tab:

      Outcome + Outcome - Total
    Test + 87 8 95
    Test - 9 46 55
    Total 96 54 150
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.887 0.825 0.933
3 se 0.906 0.829 0.956
4 sp 0.852 0.729 0.934
6 diag.or 55.583 20.099 153.711

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 44 10
1 8 88
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.6533 0.5714 0.729
    tp 0.6400 0.5577 0.717
    se 0.9167 0.8424 0.963
    sp 0.8148 0.6857 0.907
    diag.ac 0.8800 0.8170 0.927
    diag.or 48.4000 17.8474 131.255
    nndx 1.3671 1.1484 1.894
    youden 0.7315 0.5281 0.871
    pv.pos 0.8980 0.8203 0.950
    pv.neg 0.8462 0.7192 0.931
    lr.pos 4.9500 2.8198 8.689
    lr.neg 0.1023 0.0520 0.201
    p.rout 0.3467 0.2709 0.429
    p.rin 0.6533 0.5714 0.729
    p.tpdn 0.1852 0.0925 0.314
    p.tndp 0.0833 0.0367 0.158
    p.dntp 0.1020 0.0500 0.180
    p.dptn 0.1538 0.0688 0.281
  • tab:

      Outcome + Outcome - Total
    Test + 88 10 98
    Test - 8 44 52
    Total 96 54 150
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.880 0.817 0.927
3 se 0.917 0.842 0.963
4 sp 0.815 0.686 0.907
6 diag.or 48.400 17.847 131.255

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 23 31
1 10 86
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.780 0.7051 0.843
    tp 0.640 0.5577 0.717
    se 0.896 0.8168 0.949
    sp 0.426 0.2923 0.568
    diag.ac 0.727 0.6480 0.796
    diag.or 6.381 2.7316 14.904
    nndx 3.108 1.9349 9.165
    youden 0.322 0.1091 0.517
    pv.pos 0.735 0.6455 0.812
    pv.neg 0.697 0.5129 0.844
    lr.pos 1.560 1.2279 1.983
    lr.neg 0.245 0.1260 0.475
    p.rout 0.220 0.1565 0.295
    p.rin 0.780 0.7051 0.843
    p.tpdn 0.574 0.4321 0.708
    p.tndp 0.104 0.0511 0.183
    p.dntp 0.265 0.1877 0.355
    p.dptn 0.303 0.1559 0.487
  • tab:

      Outcome + Outcome - Total
    Test + 86 31 117
    Test - 10 23 33
    Total 96 54 150
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.727 0.648 0.796
3 se 0.896 0.817 0.949
4 sp 0.426 0.292 0.568
6 diag.or 6.381 2.732 14.904


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 15 39
1 6 90
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.8600 0.7940 0.911
    tp 0.6400 0.5577 0.717
    se 0.9375 0.8689 0.977
    sp 0.2778 0.1646 0.416
    diag.ac 0.7000 0.6199 0.772
    diag.or 5.7692 2.0833 15.977
    nndx 4.6452 2.5435 29.878
    youden 0.2153 0.0335 0.393
    pv.pos 0.6977 0.6106 0.775
    pv.neg 0.7143 0.4782 0.887
    lr.pos 1.2981 1.0915 1.544
    lr.neg 0.2250 0.0928 0.546
    p.rout 0.1400 0.0888 0.206
    p.rin 0.8600 0.7940 0.911
    p.tpdn 0.7222 0.5836 0.835
    p.tndp 0.0625 0.0233 0.131
    p.dntp 0.3023 0.2246 0.389
    p.dptn 0.2857 0.1128 0.522
  • tab:

      Outcome + Outcome - Total
    Test + 90 39 129
    Test - 6 15 21
    Total 96 54 150
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.700 0.620 0.772
3 se 0.938 0.869 0.977
4 sp 0.278 0.165 0.416
6 diag.or 5.769 2.083 15.977
  par(op)